{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd # 导入Pandas模块进行数据清洗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n"
     ]
    }
   ],
   "source": [
    "# 读取爬取到的微博用户数据 并转换为Pandas模块中的DataFrame类型\n",
    "user_df = pd.read_csv(\"../data/user.csv\")\n",
    "# 打印类型\n",
    "print(type(user_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>gender</th>\n",
       "      <th>screen_name</th>\n",
       "      <th>followers_count</th>\n",
       "      <th>follow_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1002275340</td>\n",
       "      <td>f</td>\n",
       "      <td>迷糊滴日子</td>\n",
       "      <td>500</td>\n",
       "      <td>390</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1058856575</td>\n",
       "      <td>f</td>\n",
       "      <td>小愛俏丽嘛</td>\n",
       "      <td>338</td>\n",
       "      <td>131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1096883942</td>\n",
       "      <td>f</td>\n",
       "      <td>想念千里香</td>\n",
       "      <td>88</td>\n",
       "      <td>171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1103099472</td>\n",
       "      <td>f</td>\n",
       "      <td>你在脸红吗S</td>\n",
       "      <td>811</td>\n",
       "      <td>371</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1107885070</td>\n",
       "      <td>f</td>\n",
       "      <td>_一颗浆果_</td>\n",
       "      <td>98</td>\n",
       "      <td>238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1163424902</td>\n",
       "      <td>m</td>\n",
       "      <td>白兔菟菟</td>\n",
       "      <td>193</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1181284907</td>\n",
       "      <td>m</td>\n",
       "      <td>白服侠</td>\n",
       "      <td>567</td>\n",
       "      <td>333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1218917552</td>\n",
       "      <td>m</td>\n",
       "      <td>布庶</td>\n",
       "      <td>2.3万</td>\n",
       "      <td>231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1225172501</td>\n",
       "      <td>f</td>\n",
       "      <td>visajj</td>\n",
       "      <td>256</td>\n",
       "      <td>648</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1226285361</td>\n",
       "      <td>f</td>\n",
       "      <td>嘉云wind_crystal</td>\n",
       "      <td>306</td>\n",
       "      <td>275</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           id gender     screen_name followers_count  follow_count\n",
       "0  1002275340      f           迷糊滴日子             500           390\n",
       "1  1058856575      f           小愛俏丽嘛             338           131\n",
       "2  1096883942      f           想念千里香              88           171\n",
       "3  1103099472      f          你在脸红吗S             811           371\n",
       "4  1107885070      f          _一颗浆果_              98           238\n",
       "5  1163424902      m            白兔菟菟             193           103\n",
       "6  1181284907      m             白服侠             567           333\n",
       "7  1218917552      m              布庶            2.3万           231\n",
       "8  1225172501      f          visajj             256           648\n",
       "9  1226285361      f  嘉云wind_crystal             306           275"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看user_df前10条数据\n",
    "user_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将列名转换为中文\n",
    "user_df = user_df.rename(columns={'id':'用户id'\n",
    "                        ,'gender':'性别'\n",
    "                        ,'screen_name':'用户名'\n",
    "                        ,'followers_count':'粉丝数'\n",
    "                        ,'follow_count':'关注数'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 486 entries, 0 to 485\n",
      "Data columns (total 5 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   用户id    486 non-null    int64 \n",
      " 1   性别      486 non-null    object\n",
      " 2   用户名     486 non-null    object\n",
      " 3   粉丝数     486 non-null    object\n",
      " 4   关注数     486 non-null    int64 \n",
      "dtypes: int64(2), object(3)\n",
      "memory usage: 19.1+ KB\n"
     ]
    }
   ],
   "source": [
    "# 查看每一列的信息\n",
    "# 可以发现 粉丝数 这一列的被解析成了 string（在pandas中的object表示str）类型\n",
    "# 数据中带有 '万' 字 故需要进行转换\n",
    "user_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>关注数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>4.860000e+02</td>\n",
       "      <td>486.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>4.121560e+09</td>\n",
       "      <td>446.732510</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.858855e+09</td>\n",
       "      <td>464.668145</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.002275e+09</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>2.296542e+09</td>\n",
       "      <td>181.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>3.896263e+09</td>\n",
       "      <td>340.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>5.832447e+09</td>\n",
       "      <td>555.750000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>7.555550e+09</td>\n",
       "      <td>4768.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               用户id          关注数\n",
       "count  4.860000e+02   486.000000\n",
       "mean   4.121560e+09   446.732510\n",
       "std    1.858855e+09   464.668145\n",
       "min    1.002275e+09     5.000000\n",
       "25%    2.296542e+09   181.000000\n",
       "50%    3.896263e+09   340.500000\n",
       "75%    5.832447e+09   555.750000\n",
       "max    7.555550e+09  4768.000000"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看user_df 数值列的基本统计信息：最大值、最小值、标准差、平均值等\n",
    "user_df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "用户id    0\n",
       "性别      0\n",
       "用户名     0\n",
       "粉丝数     0\n",
       "关注数     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看每一列是否有空数据\n",
    "user_df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 按用户id对数据去重\n",
    "user_df = user_df.drop_duplicates('用户id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>性别</th>\n",
       "      <th>用户名</th>\n",
       "      <th>粉丝数</th>\n",
       "      <th>关注数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1002275340</td>\n",
       "      <td>f</td>\n",
       "      <td>迷糊滴日子</td>\n",
       "      <td>500</td>\n",
       "      <td>390</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1058856575</td>\n",
       "      <td>f</td>\n",
       "      <td>小愛俏丽嘛</td>\n",
       "      <td>338</td>\n",
       "      <td>131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1096883942</td>\n",
       "      <td>f</td>\n",
       "      <td>想念千里香</td>\n",
       "      <td>88</td>\n",
       "      <td>171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1103099472</td>\n",
       "      <td>f</td>\n",
       "      <td>你在脸红吗S</td>\n",
       "      <td>811</td>\n",
       "      <td>371</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1107885070</td>\n",
       "      <td>f</td>\n",
       "      <td>_一颗浆果_</td>\n",
       "      <td>98</td>\n",
       "      <td>238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1163424902</td>\n",
       "      <td>m</td>\n",
       "      <td>白兔菟菟</td>\n",
       "      <td>193</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1181284907</td>\n",
       "      <td>m</td>\n",
       "      <td>白服侠</td>\n",
       "      <td>567</td>\n",
       "      <td>333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1218917552</td>\n",
       "      <td>m</td>\n",
       "      <td>布庶</td>\n",
       "      <td>2.3万</td>\n",
       "      <td>231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1225172501</td>\n",
       "      <td>f</td>\n",
       "      <td>visajj</td>\n",
       "      <td>256</td>\n",
       "      <td>648</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1226285361</td>\n",
       "      <td>f</td>\n",
       "      <td>嘉云wind_crystal</td>\n",
       "      <td>306</td>\n",
       "      <td>275</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         用户id 性别             用户名   粉丝数  关注数\n",
       "0  1002275340  f           迷糊滴日子   500  390\n",
       "1  1058856575  f           小愛俏丽嘛   338  131\n",
       "2  1096883942  f           想念千里香    88  171\n",
       "3  1103099472  f          你在脸红吗S   811  371\n",
       "4  1107885070  f          _一颗浆果_    98  238\n",
       "5  1163424902  m            白兔菟菟   193  103\n",
       "6  1181284907  m             白服侠   567  333\n",
       "7  1218917552  m              布庶  2.3万  231\n",
       "8  1225172501  f          visajj   256  648\n",
       "9  1226285361  f  嘉云wind_crystal   306  275"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看去重后的结果\n",
    "user_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 数据的问题：\n",
    "# 1、性别列 取值为 m、f 可将其转换为 男、女\n",
    "# 2、粉丝数列 单位不统一 需进行转换\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>性别</th>\n",
       "      <th>用户名</th>\n",
       "      <th>粉丝数</th>\n",
       "      <th>关注数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1002275340</td>\n",
       "      <td>f</td>\n",
       "      <td>迷糊滴日子</td>\n",
       "      <td>500</td>\n",
       "      <td>390</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1058856575</td>\n",
       "      <td>f</td>\n",
       "      <td>小愛俏丽嘛</td>\n",
       "      <td>338</td>\n",
       "      <td>131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1096883942</td>\n",
       "      <td>f</td>\n",
       "      <td>想念千里香</td>\n",
       "      <td>88</td>\n",
       "      <td>171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1103099472</td>\n",
       "      <td>f</td>\n",
       "      <td>你在脸红吗S</td>\n",
       "      <td>811</td>\n",
       "      <td>371</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1107885070</td>\n",
       "      <td>f</td>\n",
       "      <td>_一颗浆果_</td>\n",
       "      <td>98</td>\n",
       "      <td>238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1225172501</td>\n",
       "      <td>f</td>\n",
       "      <td>visajj</td>\n",
       "      <td>256</td>\n",
       "      <td>648</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1226285361</td>\n",
       "      <td>f</td>\n",
       "      <td>嘉云wind_crystal</td>\n",
       "      <td>306</td>\n",
       "      <td>275</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>1260721485</td>\n",
       "      <td>f</td>\n",
       "      <td>玥玥玥玥玥野兔</td>\n",
       "      <td>436</td>\n",
       "      <td>141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>1269668000</td>\n",
       "      <td>f</td>\n",
       "      <td>lindado</td>\n",
       "      <td>558</td>\n",
       "      <td>979</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>1306484861</td>\n",
       "      <td>f</td>\n",
       "      <td>大胖大胖</td>\n",
       "      <td>454.6万</td>\n",
       "      <td>1042</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          用户id 性别             用户名     粉丝数   关注数\n",
       "0   1002275340  f           迷糊滴日子     500   390\n",
       "1   1058856575  f           小愛俏丽嘛     338   131\n",
       "2   1096883942  f           想念千里香      88   171\n",
       "3   1103099472  f          你在脸红吗S     811   371\n",
       "4   1107885070  f          _一颗浆果_      98   238\n",
       "8   1225172501  f          visajj     256   648\n",
       "9   1226285361  f  嘉云wind_crystal     306   275\n",
       "10  1260721485  f         玥玥玥玥玥野兔     436   141\n",
       "11  1269668000  f         lindado     558   979\n",
       "12  1306484861  f            大胖大胖  454.6万  1042"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将性别取值换为 男、女\n",
    "\n",
    "# 将女性用户取出来\n",
    "female_user_df = user_df[user_df['性别'] == 'f']\n",
    "# 查看数据\n",
    "female_user_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将女性用户的性别取值改为 女\n",
    "user_df.loc[female_user_df.index,'性别'] = '女'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>性别</th>\n",
       "      <th>用户名</th>\n",
       "      <th>粉丝数</th>\n",
       "      <th>关注数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1002275340</td>\n",
       "      <td>女</td>\n",
       "      <td>迷糊滴日子</td>\n",
       "      <td>500</td>\n",
       "      <td>390</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1058856575</td>\n",
       "      <td>女</td>\n",
       "      <td>小愛俏丽嘛</td>\n",
       "      <td>338</td>\n",
       "      <td>131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1096883942</td>\n",
       "      <td>女</td>\n",
       "      <td>想念千里香</td>\n",
       "      <td>88</td>\n",
       "      <td>171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1103099472</td>\n",
       "      <td>女</td>\n",
       "      <td>你在脸红吗S</td>\n",
       "      <td>811</td>\n",
       "      <td>371</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1107885070</td>\n",
       "      <td>女</td>\n",
       "      <td>_一颗浆果_</td>\n",
       "      <td>98</td>\n",
       "      <td>238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1163424902</td>\n",
       "      <td>m</td>\n",
       "      <td>白兔菟菟</td>\n",
       "      <td>193</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1181284907</td>\n",
       "      <td>m</td>\n",
       "      <td>白服侠</td>\n",
       "      <td>567</td>\n",
       "      <td>333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1218917552</td>\n",
       "      <td>m</td>\n",
       "      <td>布庶</td>\n",
       "      <td>2.3万</td>\n",
       "      <td>231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1225172501</td>\n",
       "      <td>女</td>\n",
       "      <td>visajj</td>\n",
       "      <td>256</td>\n",
       "      <td>648</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1226285361</td>\n",
       "      <td>女</td>\n",
       "      <td>嘉云wind_crystal</td>\n",
       "      <td>306</td>\n",
       "      <td>275</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         用户id 性别             用户名   粉丝数  关注数\n",
       "0  1002275340  女           迷糊滴日子   500  390\n",
       "1  1058856575  女           小愛俏丽嘛   338  131\n",
       "2  1096883942  女           想念千里香    88  171\n",
       "3  1103099472  女          你在脸红吗S   811  371\n",
       "4  1107885070  女          _一颗浆果_    98  238\n",
       "5  1163424902  m            白兔菟菟   193  103\n",
       "6  1181284907  m             白服侠   567  333\n",
       "7  1218917552  m              布庶  2.3万  231\n",
       "8  1225172501  女          visajj   256  648\n",
       "9  1226285361  女  嘉云wind_crystal   306  275"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看修改后的结果\n",
    "user_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>性别</th>\n",
       "      <th>用户名</th>\n",
       "      <th>粉丝数</th>\n",
       "      <th>关注数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1163424902</td>\n",
       "      <td>m</td>\n",
       "      <td>白兔菟菟</td>\n",
       "      <td>193</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1181284907</td>\n",
       "      <td>m</td>\n",
       "      <td>白服侠</td>\n",
       "      <td>567</td>\n",
       "      <td>333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1218917552</td>\n",
       "      <td>m</td>\n",
       "      <td>布庶</td>\n",
       "      <td>2.3万</td>\n",
       "      <td>231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>1417860852</td>\n",
       "      <td>m</td>\n",
       "      <td>摄影师arkey007</td>\n",
       "      <td>104.3万</td>\n",
       "      <td>884</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>1659273132</td>\n",
       "      <td>m</td>\n",
       "      <td>综艺小娱哥</td>\n",
       "      <td>23.6万</td>\n",
       "      <td>224</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>1662641957</td>\n",
       "      <td>m</td>\n",
       "      <td>seeyoon</td>\n",
       "      <td>1.8万</td>\n",
       "      <td>2090</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>1738932247</td>\n",
       "      <td>m</td>\n",
       "      <td>我的损友是个极品</td>\n",
       "      <td>2683.6万</td>\n",
       "      <td>478</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>1750070171</td>\n",
       "      <td>m</td>\n",
       "      <td>36氪</td>\n",
       "      <td>336.6万</td>\n",
       "      <td>998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>1751798254</td>\n",
       "      <td>m</td>\n",
       "      <td>徐小渣</td>\n",
       "      <td>979</td>\n",
       "      <td>429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>1812453397</td>\n",
       "      <td>m</td>\n",
       "      <td>要飞的猪儿</td>\n",
       "      <td>214</td>\n",
       "      <td>166</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          用户id 性别          用户名      粉丝数   关注数\n",
       "5   1163424902  m         白兔菟菟      193   103\n",
       "6   1181284907  m          白服侠      567   333\n",
       "7   1218917552  m           布庶     2.3万   231\n",
       "14  1417860852  m  摄影师arkey007   104.3万   884\n",
       "28  1659273132  m        综艺小娱哥    23.6万   224\n",
       "29  1662641957  m      seeyoon     1.8万  2090\n",
       "38  1738932247  m     我的损友是个极品  2683.6万   478\n",
       "45  1750070171  m          36氪   336.6万   998\n",
       "47  1751798254  m          徐小渣      979   429\n",
       "62  1812453397  m        要飞的猪儿      214   166"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将男性用户取出来\n",
    "male_user_df = user_df[user_df['性别'] == 'm']\n",
    "# 查看数据\n",
    "male_user_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将男性用户的性别取值改为 女\n",
    "user_df.loc[male_user_df.index,'性别'] = '男'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>性别</th>\n",
       "      <th>用户名</th>\n",
       "      <th>粉丝数</th>\n",
       "      <th>关注数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1002275340</td>\n",
       "      <td>女</td>\n",
       "      <td>迷糊滴日子</td>\n",
       "      <td>500</td>\n",
       "      <td>390</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1058856575</td>\n",
       "      <td>女</td>\n",
       "      <td>小愛俏丽嘛</td>\n",
       "      <td>338</td>\n",
       "      <td>131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1096883942</td>\n",
       "      <td>女</td>\n",
       "      <td>想念千里香</td>\n",
       "      <td>88</td>\n",
       "      <td>171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1103099472</td>\n",
       "      <td>女</td>\n",
       "      <td>你在脸红吗S</td>\n",
       "      <td>811</td>\n",
       "      <td>371</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1107885070</td>\n",
       "      <td>女</td>\n",
       "      <td>_一颗浆果_</td>\n",
       "      <td>98</td>\n",
       "      <td>238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1163424902</td>\n",
       "      <td>男</td>\n",
       "      <td>白兔菟菟</td>\n",
       "      <td>193</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1181284907</td>\n",
       "      <td>男</td>\n",
       "      <td>白服侠</td>\n",
       "      <td>567</td>\n",
       "      <td>333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1218917552</td>\n",
       "      <td>男</td>\n",
       "      <td>布庶</td>\n",
       "      <td>2.3万</td>\n",
       "      <td>231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1225172501</td>\n",
       "      <td>女</td>\n",
       "      <td>visajj</td>\n",
       "      <td>256</td>\n",
       "      <td>648</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1226285361</td>\n",
       "      <td>女</td>\n",
       "      <td>嘉云wind_crystal</td>\n",
       "      <td>306</td>\n",
       "      <td>275</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         用户id 性别             用户名   粉丝数  关注数\n",
       "0  1002275340  女           迷糊滴日子   500  390\n",
       "1  1058856575  女           小愛俏丽嘛   338  131\n",
       "2  1096883942  女           想念千里香    88  171\n",
       "3  1103099472  女          你在脸红吗S   811  371\n",
       "4  1107885070  女          _一颗浆果_    98  238\n",
       "5  1163424902  男            白兔菟菟   193  103\n",
       "6  1181284907  男             白服侠   567  333\n",
       "7  1218917552  男              布庶  2.3万  231\n",
       "8  1225172501  女          visajj   256  648\n",
       "9  1226285361  女  嘉云wind_crystal   306  275"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看修改后的结果\n",
    "user_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>性别</th>\n",
       "      <th>用户名</th>\n",
       "      <th>粉丝数</th>\n",
       "      <th>关注数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1218917552</td>\n",
       "      <td>男</td>\n",
       "      <td>布庶</td>\n",
       "      <td>2.3万</td>\n",
       "      <td>231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>1306484861</td>\n",
       "      <td>女</td>\n",
       "      <td>大胖大胖</td>\n",
       "      <td>454.6万</td>\n",
       "      <td>1042</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>1417860852</td>\n",
       "      <td>男</td>\n",
       "      <td>摄影师arkey007</td>\n",
       "      <td>104.3万</td>\n",
       "      <td>884</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>1549362863</td>\n",
       "      <td>女</td>\n",
       "      <td>张大奕eve</td>\n",
       "      <td>1222.8万</td>\n",
       "      <td>873</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>1596931083</td>\n",
       "      <td>女</td>\n",
       "      <td>帕丽扎提Parissa耶</td>\n",
       "      <td>401.4万</td>\n",
       "      <td>367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>1659273132</td>\n",
       "      <td>男</td>\n",
       "      <td>综艺小娱哥</td>\n",
       "      <td>23.6万</td>\n",
       "      <td>224</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>1662641957</td>\n",
       "      <td>男</td>\n",
       "      <td>seeyoon</td>\n",
       "      <td>1.8万</td>\n",
       "      <td>2090</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>1680599054</td>\n",
       "      <td>女</td>\n",
       "      <td>姐有料</td>\n",
       "      <td>208.7万</td>\n",
       "      <td>151</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>1733927074</td>\n",
       "      <td>女</td>\n",
       "      <td>bearbybear熊熊</td>\n",
       "      <td>38.9万</td>\n",
       "      <td>283</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>1738932247</td>\n",
       "      <td>男</td>\n",
       "      <td>我的损友是个极品</td>\n",
       "      <td>2683.6万</td>\n",
       "      <td>478</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          用户id 性别           用户名      粉丝数   关注数\n",
       "7   1218917552  男            布庶     2.3万   231\n",
       "12  1306484861  女          大胖大胖   454.6万  1042\n",
       "14  1417860852  男   摄影师arkey007   104.3万   884\n",
       "18  1549362863  女        张大奕eve  1222.8万   873\n",
       "22  1596931083  女  帕丽扎提Parissa耶   401.4万   367\n",
       "28  1659273132  男         综艺小娱哥    23.6万   224\n",
       "29  1662641957  男       seeyoon     1.8万  2090\n",
       "30  1680599054  女           姐有料   208.7万   151\n",
       "37  1733927074  女  bearbybear熊熊    38.9万   283\n",
       "38  1738932247  男      我的损友是个极品  2683.6万   478"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 对 粉丝数 进行转换\n",
    "\n",
    "# 取出值中包含 万 的数据\n",
    "user_df_wan = user_df[user_df['粉丝数'].str.contains('万')]\n",
    "# 查看数据\n",
    "user_df_wan.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7        23000\n",
       "12     4546000\n",
       "14     1043000\n",
       "18    12228000\n",
       "22     4014000\n",
       "28      236000\n",
       "29       18000\n",
       "30     2087000\n",
       "37      389000\n",
       "38    26836000\n",
       "Name: 粉丝数, dtype: int64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将粉丝数包含 万 的值转换为10进制\n",
    "user_df_wan_trans = user_df_wan['粉丝数'].apply(lambda x:int(float(x[:-1])*10000))\n",
    "# 查看转换结果\n",
    "user_df_wan_trans.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将转换后的结果 对原数据进行替换\n",
    "user_df.loc[user_df_wan.index,'粉丝数'] = user_df_wan_trans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户id</th>\n",
       "      <th>性别</th>\n",
       "      <th>用户名</th>\n",
       "      <th>粉丝数</th>\n",
       "      <th>关注数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1002275340</td>\n",
       "      <td>女</td>\n",
       "      <td>迷糊滴日子</td>\n",
       "      <td>500</td>\n",
       "      <td>390</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1058856575</td>\n",
       "      <td>女</td>\n",
       "      <td>小愛俏丽嘛</td>\n",
       "      <td>338</td>\n",
       "      <td>131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1096883942</td>\n",
       "      <td>女</td>\n",
       "      <td>想念千里香</td>\n",
       "      <td>88</td>\n",
       "      <td>171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1103099472</td>\n",
       "      <td>女</td>\n",
       "      <td>你在脸红吗S</td>\n",
       "      <td>811</td>\n",
       "      <td>371</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1107885070</td>\n",
       "      <td>女</td>\n",
       "      <td>_一颗浆果_</td>\n",
       "      <td>98</td>\n",
       "      <td>238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1163424902</td>\n",
       "      <td>男</td>\n",
       "      <td>白兔菟菟</td>\n",
       "      <td>193</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1181284907</td>\n",
       "      <td>男</td>\n",
       "      <td>白服侠</td>\n",
       "      <td>567</td>\n",
       "      <td>333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1218917552</td>\n",
       "      <td>男</td>\n",
       "      <td>布庶</td>\n",
       "      <td>23000</td>\n",
       "      <td>231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1225172501</td>\n",
       "      <td>女</td>\n",
       "      <td>visajj</td>\n",
       "      <td>256</td>\n",
       "      <td>648</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1226285361</td>\n",
       "      <td>女</td>\n",
       "      <td>嘉云wind_crystal</td>\n",
       "      <td>306</td>\n",
       "      <td>275</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         用户id 性别             用户名    粉丝数  关注数\n",
       "0  1002275340  女           迷糊滴日子    500  390\n",
       "1  1058856575  女           小愛俏丽嘛    338  131\n",
       "2  1096883942  女           想念千里香     88  171\n",
       "3  1103099472  女          你在脸红吗S    811  371\n",
       "4  1107885070  女          _一颗浆果_     98  238\n",
       "5  1163424902  男            白兔菟菟    193  103\n",
       "6  1181284907  男             白服侠    567  333\n",
       "7  1218917552  男              布庶  23000  231\n",
       "8  1225172501  女          visajj    256  648\n",
       "9  1226285361  女  嘉云wind_crystal    306  275"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first 10 rows after the replacement\n",
    "user_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast the follower-count column ('粉丝数') to an integer dtype\n",
    "user_df['粉丝数'] = user_df['粉丝数'].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 486 entries, 0 to 485\n",
      "Data columns (total 5 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   用户id    486 non-null    int64 \n",
      " 1   性别      486 non-null    object\n",
      " 2   用户名     486 non-null    object\n",
      " 3   粉丝数     486 non-null    int64 \n",
      " 4   关注数     486 non-null    int64 \n",
      "dtypes: int64(3), object(2)\n",
      "memory usage: 42.8+ KB\n"
     ]
    }
   ],
   "source": [
    "# Show the dtype and non-null count of every column\n",
    "user_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将 用户数据 清洗后的结果保存\n",
    "user_df.to_csv(\"../data/user_clean.csv\",index=None,encoding='utf8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n"
     ]
    }
   ],
   "source": [
    "# Load the scraped Weibo post data into a pandas DataFrame\n",
    "weibo_df = pd.read_csv(\"../data/weiboarticle.csv\")\n",
    "# Print the type of the loaded object\n",
    "print(type(weibo_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>comments_count</th>\n",
       "      <th>created_at</th>\n",
       "      <th>source</th>\n",
       "      <th>reposts_count</th>\n",
       "      <th>attitudes_count</th>\n",
       "      <th>text</th>\n",
       "      <th>user_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4698762426975702</td>\n",
       "      <td>122</td>\n",
       "      <td>Mon Nov 01 16:35:24 +0800 2021</td>\n",
       "      <td>见置顶微博Android</td>\n",
       "      <td>144</td>\n",
       "      <td>1236</td>\n",
       "      <td>&lt;a  href=\"\"https://m.weibo.cn/search?container...</td>\n",
       "      <td>5991749877</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4699091725977118</td>\n",
       "      <td>561</td>\n",
       "      <td>Tue Nov 02 14:23:55 +0800 2021</td>\n",
       "      <td>张大奕eve的小店</td>\n",
       "      <td>396</td>\n",
       "      <td>633</td>\n",
       "      <td>我们黑牌今晚21：00定时抢现货～～&lt;br /&gt;消费 top10 送 iPhone13！！&lt;...</td>\n",
       "      <td>1549362863</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4699111129090066</td>\n",
       "      <td>1022</td>\n",
       "      <td>Tue Nov 02 15:41:01 +0800 2021</td>\n",
       "      <td>微博视频号</td>\n",
       "      <td>1008</td>\n",
       "      <td>3055</td>\n",
       "      <td>这就是男同学的噩梦吗&lt;span class=\"\"url-icon\"\"&gt;&lt;img alt=[允...</td>\n",
       "      <td>1738932247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4699120227059627</td>\n",
       "      <td>7</td>\n",
       "      <td>Tue Nov 02 16:17:10 +0800 2021</td>\n",
       "      <td>剑网3超话</td>\n",
       "      <td>1</td>\n",
       "      <td>35</td>\n",
       "      <td>&lt;a  href=\"\"https://m.weibo.cn/p/index?extparam...</td>\n",
       "      <td>5999050245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4699120453291260</td>\n",
       "      <td>71</td>\n",
       "      <td>Tue Nov 02 16:18:04 +0800 2021</td>\n",
       "      <td>微博 weibo.com</td>\n",
       "      <td>26</td>\n",
       "      <td>60</td>\n",
       "      <td>[赢牛奶]上海臻景医美专场&amp;amp;双十一盛大活动就在眼前了！！！这次真的准备了百万级赠礼哦...</td>\n",
       "      <td>2845185161</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>4699122840634527</td>\n",
       "      <td>32</td>\n",
       "      <td>Tue Nov 02 16:27:33 +0800 2021</td>\n",
       "      <td>新版微博 weibo.com</td>\n",
       "      <td>17</td>\n",
       "      <td>96</td>\n",
       "      <td>&lt;a  href=\"\"https://m.weibo.cn/search?container...</td>\n",
       "      <td>6421583580</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>4699127131146614</td>\n",
       "      <td>22</td>\n",
       "      <td>Tue Nov 02 16:44:36 +0800 2021</td>\n",
       "      <td>严浩翔超话</td>\n",
       "      <td>0</td>\n",
       "      <td>29</td>\n",
       "      <td>&lt;a  href=\"\"https://m.weibo.cn/p/index?extparam...</td>\n",
       "      <td>6466957523</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>4699137528824076</td>\n",
       "      <td>41</td>\n",
       "      <td>Tue Nov 02 17:25:54 +0800 2021</td>\n",
       "      <td>乎然捡来的iPhone 12</td>\n",
       "      <td>1</td>\n",
       "      <td>22</td>\n",
       "      <td>准备出一期爱用护肤分享&lt;br /&gt;把之前瓶瓶罐罐都翻出来，发现这几年用了好多啊&lt;span c...</td>\n",
       "      <td>6858679534</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>4699152058156405</td>\n",
       "      <td>24</td>\n",
       "      <td>Tue Nov 02 18:23:39 +0800 2021</td>\n",
       "      <td>HarmonyOS设备</td>\n",
       "      <td>3</td>\n",
       "      <td>64</td>\n",
       "      <td>&lt;a  href=\"\"https://m.weibo.cn/search?container...</td>\n",
       "      <td>6084203417</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>4699176469008979</td>\n",
       "      <td>663</td>\n",
       "      <td>Tue Nov 02 20:00:39 +0800 2021</td>\n",
       "      <td>iPhone客户端</td>\n",
       "      <td>172</td>\n",
       "      <td>3280</td>\n",
       "      <td>- 分享饰品 -&lt;br /&gt;&lt;br /&gt;&lt;a  href=\"\"https://m.weibo...</td>\n",
       "      <td>5884957483</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id  comments_count                      created_at  \\\n",
       "0  4698762426975702             122  Mon Nov 01 16:35:24 +0800 2021   \n",
       "1  4699091725977118             561  Tue Nov 02 14:23:55 +0800 2021   \n",
       "2  4699111129090066            1022  Tue Nov 02 15:41:01 +0800 2021   \n",
       "3  4699120227059627               7  Tue Nov 02 16:17:10 +0800 2021   \n",
       "4  4699120453291260              71  Tue Nov 02 16:18:04 +0800 2021   \n",
       "5  4699122840634527              32  Tue Nov 02 16:27:33 +0800 2021   \n",
       "6  4699127131146614              22  Tue Nov 02 16:44:36 +0800 2021   \n",
       "7  4699137528824076              41  Tue Nov 02 17:25:54 +0800 2021   \n",
       "8  4699152058156405              24  Tue Nov 02 18:23:39 +0800 2021   \n",
       "9  4699176469008979             663  Tue Nov 02 20:00:39 +0800 2021   \n",
       "\n",
       "           source  reposts_count  attitudes_count  \\\n",
       "0    见置顶微博Android            144             1236   \n",
       "1       张大奕eve的小店            396              633   \n",
       "2           微博视频号           1008             3055   \n",
       "3           剑网3超话              1               35   \n",
       "4    微博 weibo.com             26               60   \n",
       "5  新版微博 weibo.com             17               96   \n",
       "6           严浩翔超话              0               29   \n",
       "7  乎然捡来的iPhone 12              1               22   \n",
       "8     HarmonyOS设备              3               64   \n",
       "9       iPhone客户端            172             3280   \n",
       "\n",
       "                                                text     user_id  \n",
       "0  <a  href=\"\"https://m.weibo.cn/search?container...  5991749877  \n",
       "1  我们黑牌今晚21：00定时抢现货～～<br />消费 top10 送 iPhone13！！<...  1549362863  \n",
       "2  这就是男同学的噩梦吗<span class=\"\"url-icon\"\"><img alt=[允...  1738932247  \n",
       "3  <a  href=\"\"https://m.weibo.cn/p/index?extparam...  5999050245  \n",
       "4  [赢牛奶]上海臻景医美专场&amp;双十一盛大活动就在眼前了！！！这次真的准备了百万级赠礼哦...  2845185161  \n",
       "5  <a  href=\"\"https://m.weibo.cn/search?container...  6421583580  \n",
       "6  <a  href=\"\"https://m.weibo.cn/p/index?extparam...  6466957523  \n",
       "7  准备出一期爱用护肤分享<br />把之前瓶瓶罐罐都翻出来，发现这几年用了好多啊<span c...  6858679534  \n",
       "8  <a  href=\"\"https://m.weibo.cn/search?container...  6084203417  \n",
       "9  - 分享饰品 -<br /><br /><a  href=\"\"https://m.weibo...  5884957483  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first 10 rows of weibo_df\n",
    "weibo_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将列名转换为中文\n",
    "weibo_df = weibo_df.rename(columns={\n",
    "    'id':'微博id',\n",
    "    'comments_count':'评论数',\n",
    "    'created_at':'发布时间',\n",
    "    'source':'来源',\n",
    "    'reposts_count':'转发数',\n",
    "    'attitudes_count':'点赞数',\n",
    "    'text':'微博文章',\n",
    "    'uesr_id':'用户id'\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 418 entries, 0 to 417\n",
      "Data columns (total 8 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   微博id     418 non-null    int64 \n",
      " 1   评论数      418 non-null    int64 \n",
      " 2   发布时间     418 non-null    object\n",
      " 3   来源       389 non-null    object\n",
      " 4   转发数      418 non-null    int64 \n",
      " 5   点赞数      418 non-null    int64 \n",
      " 6   微博文章     418 non-null    object\n",
      " 7   user_id  418 non-null    int64 \n",
      "dtypes: int64(5), object(3)\n",
      "memory usage: 26.2+ KB\n"
     ]
    }
   ],
   "source": [
    "# Show the dtype and non-null count of every column\n",
    "weibo_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>微博id</th>\n",
       "      <th>评论数</th>\n",
       "      <th>转发数</th>\n",
       "      <th>点赞数</th>\n",
       "      <th>user_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>4.180000e+02</td>\n",
       "      <td>418.000000</td>\n",
       "      <td>418.000000</td>\n",
       "      <td>418.000000</td>\n",
       "      <td>4.180000e+02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>4.699232e+15</td>\n",
       "      <td>8.607656</td>\n",
       "      <td>4.449761</td>\n",
       "      <td>25.191388</td>\n",
       "      <td>4.126569e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.845556e+10</td>\n",
       "      <td>67.187159</td>\n",
       "      <td>54.031420</td>\n",
       "      <td>230.672943</td>\n",
       "      <td>1.848186e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>4.698762e+15</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.058857e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>4.699233e+15</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.339875e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>4.699237e+15</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.874717e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>4.699239e+15</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>5.845042e+09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>4.699242e+15</td>\n",
       "      <td>1022.000000</td>\n",
       "      <td>1008.000000</td>\n",
       "      <td>3280.000000</td>\n",
       "      <td>7.555550e+09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               微博id          评论数          转发数          点赞数       user_id\n",
       "count  4.180000e+02   418.000000   418.000000   418.000000  4.180000e+02\n",
       "mean   4.699232e+15     8.607656     4.449761    25.191388  4.126569e+09\n",
       "std    2.845556e+10    67.187159    54.031420   230.672943  1.848186e+09\n",
       "min    4.698762e+15     0.000000     0.000000     0.000000  1.058857e+09\n",
       "25%    4.699233e+15     0.000000     0.000000     0.000000  2.339875e+09\n",
       "50%    4.699237e+15     0.000000     0.000000     0.000000  3.874717e+09\n",
       "75%    4.699239e+15     1.000000     0.000000     1.000000  5.845042e+09\n",
       "max    4.699242e+15  1022.000000  1008.000000  3280.000000  7.555550e+09"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics of the numeric columns: max, min, mean, standard deviation, etc.\n",
    "weibo_df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "微博id        0\n",
       "评论数         0\n",
       "发布时间        0\n",
       "来源         29\n",
       "转发数         0\n",
       "点赞数         0\n",
       "微博文章        0\n",
       "user_id     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the missing values in each column\n",
    "weibo_df.isnull().sum()"
   ]
  },
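  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Issues found while inspecting the data:\n",
    "# 1. 发布时间 (post time) is a raw text timestamp such as 'Mon Nov 01 16:35:24 +0800 2021', which is awkward for later analysis; convert it to the yyyy-mm-dd HH:MM:SS format\n",
    "# 2. 来源 (source) has missing values (29 rows); replace them with a default value\n",
    "# 3. 微博文章 (post text) contains many HTML tags; strip them"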
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the post time column ('发布时间')\n",
    "\n",
    "# pd.to_datetime parses the raw timestamp strings into datetime values\n",
    "weibo_df['发布时间'] = pd.to_datetime(weibo_df['发布时间'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>微博id</th>\n",
       "      <th>评论数</th>\n",
       "      <th>发布时间</th>\n",
       "      <th>来源</th>\n",
       "      <th>转发数</th>\n",
       "      <th>点赞数</th>\n",
       "      <th>微博文章</th>\n",
       "      <th>user_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4698762426975702</td>\n",
       "      <td>122</td>\n",
       "      <td>2021-11-01 16:35:24+08:00</td>\n",
       "      <td>见置顶微博Android</td>\n",
       "      <td>144</td>\n",
       "      <td>1236</td>\n",
       "      <td>&lt;a  href=\"\"https://m.weibo.cn/search?container...</td>\n",
       "      <td>5991749877</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4699091725977118</td>\n",
       "      <td>561</td>\n",
       "      <td>2021-11-02 14:23:55+08:00</td>\n",
       "      <td>张大奕eve的小店</td>\n",
       "      <td>396</td>\n",
       "      <td>633</td>\n",
       "      <td>我们黑牌今晚21：00定时抢现货～～&lt;br /&gt;消费 top10 送 iPhone13！！&lt;...</td>\n",
       "      <td>1549362863</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4699111129090066</td>\n",
       "      <td>1022</td>\n",
       "      <td>2021-11-02 15:41:01+08:00</td>\n",
       "      <td>微博视频号</td>\n",
       "      <td>1008</td>\n",
       "      <td>3055</td>\n",
       "      <td>这就是男同学的噩梦吗&lt;span class=\"\"url-icon\"\"&gt;&lt;img alt=[允...</td>\n",
       "      <td>1738932247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4699120227059627</td>\n",
       "      <td>7</td>\n",
       "      <td>2021-11-02 16:17:10+08:00</td>\n",
       "      <td>剑网3超话</td>\n",
       "      <td>1</td>\n",
       "      <td>35</td>\n",
       "      <td>&lt;a  href=\"\"https://m.weibo.cn/p/index?extparam...</td>\n",
       "      <td>5999050245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4699120453291260</td>\n",
       "      <td>71</td>\n",
       "      <td>2021-11-02 16:18:04+08:00</td>\n",
       "      <td>微博 weibo.com</td>\n",
       "      <td>26</td>\n",
       "      <td>60</td>\n",
       "      <td>[赢牛奶]上海臻景医美专场&amp;amp;双十一盛大活动就在眼前了！！！这次真的准备了百万级赠礼哦...</td>\n",
       "      <td>2845185161</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               微博id   评论数                      发布时间            来源   转发数   点赞数  \\\n",
       "0  4698762426975702   122 2021-11-01 16:35:24+08:00  见置顶微博Android   144  1236   \n",
       "1  4699091725977118   561 2021-11-02 14:23:55+08:00     张大奕eve的小店   396   633   \n",
       "2  4699111129090066  1022 2021-11-02 15:41:01+08:00         微博视频号  1008  3055   \n",
       "3  4699120227059627     7 2021-11-02 16:17:10+08:00         剑网3超话     1    35   \n",
       "4  4699120453291260    71 2021-11-02 16:18:04+08:00  微博 weibo.com    26    60   \n",
       "\n",
       "                                                微博文章     user_id  \n",
       "0  <a  href=\"\"https://m.weibo.cn/search?container...  5991749877  \n",
       "1  我们黑牌今晚21：00定时抢现货～～<br />消费 top10 送 iPhone13！！<...  1549362863  \n",
       "2  这就是男同学的噩梦吗<span class=\"\"url-icon\"\"><img alt=[允...  1738932247  \n",
       "3  <a  href=\"\"https://m.weibo.cn/p/index?extparam...  5999050245  \n",
       "4  [赢牛奶]上海臻景医美专场&amp;双十一盛大活动就在眼前了！！！这次真的准备了百万级赠礼哦...  2845185161  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "weibo_df.head() # Preview the result of the conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "# 将发布时间格式化成 yyyy-mm-dd HH:MM:ss的格式\n",
    "weibo_df['发布时间'] = weibo_df['发布时间'].apply(lambda x:datetime.strftime(x,\"%Y-%m-%d %H:%M:%S\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The source column ('来源') has missing values; substitute a default.\n",
    "# Missing entries become the placeholder \"未知来源\" (\"unknown source\").\n",
    "weibo_source_na = weibo_df['来源'].fillna(\"未知来源\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对原始数据进行修改\n",
    "weibo_df.loc[weibo_source_na.index,'来源'] = weibo_source_na"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>微博id</th>\n",
       "      <th>评论数</th>\n",
       "      <th>发布时间</th>\n",
       "      <th>来源</th>\n",
       "      <th>转发数</th>\n",
       "      <th>点赞数</th>\n",
       "      <th>微博文章</th>\n",
       "      <th>user_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4698762426975702</td>\n",
       "      <td>122</td>\n",
       "      <td>2021-11-01 16:35:24</td>\n",
       "      <td>见置顶微博Android</td>\n",
       "      <td>144</td>\n",
       "      <td>1236</td>\n",
       "      <td>&lt;a  href=\"\"https://m.weibo.cn/search?container...</td>\n",
       "      <td>5991749877</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4699091725977118</td>\n",
       "      <td>561</td>\n",
       "      <td>2021-11-02 14:23:55</td>\n",
       "      <td>张大奕eve的小店</td>\n",
       "      <td>396</td>\n",
       "      <td>633</td>\n",
       "      <td>我们黑牌今晚21：00定时抢现货～～&lt;br /&gt;消费 top10 送 iPhone13！！&lt;...</td>\n",
       "      <td>1549362863</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4699111129090066</td>\n",
       "      <td>1022</td>\n",
       "      <td>2021-11-02 15:41:01</td>\n",
       "      <td>微博视频号</td>\n",
       "      <td>1008</td>\n",
       "      <td>3055</td>\n",
       "      <td>这就是男同学的噩梦吗&lt;span class=\"\"url-icon\"\"&gt;&lt;img alt=[允...</td>\n",
       "      <td>1738932247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4699120227059627</td>\n",
       "      <td>7</td>\n",
       "      <td>2021-11-02 16:17:10</td>\n",
       "      <td>剑网3超话</td>\n",
       "      <td>1</td>\n",
       "      <td>35</td>\n",
       "      <td>&lt;a  href=\"\"https://m.weibo.cn/p/index?extparam...</td>\n",
       "      <td>5999050245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4699120453291260</td>\n",
       "      <td>71</td>\n",
       "      <td>2021-11-02 16:18:04</td>\n",
       "      <td>微博 weibo.com</td>\n",
       "      <td>26</td>\n",
       "      <td>60</td>\n",
       "      <td>[赢牛奶]上海臻景医美专场&amp;amp;双十一盛大活动就在眼前了！！！这次真的准备了百万级赠礼哦...</td>\n",
       "      <td>2845185161</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               微博id   评论数                 发布时间            来源   转发数   点赞数  \\\n",
       "0  4698762426975702   122  2021-11-01 16:35:24  见置顶微博Android   144  1236   \n",
       "1  4699091725977118   561  2021-11-02 14:23:55     张大奕eve的小店   396   633   \n",
       "2  4699111129090066  1022  2021-11-02 15:41:01         微博视频号  1008  3055   \n",
       "3  4699120227059627     7  2021-11-02 16:17:10         剑网3超话     1    35   \n",
       "4  4699120453291260    71  2021-11-02 16:18:04  微博 weibo.com    26    60   \n",
       "\n",
       "                                                微博文章     user_id  \n",
       "0  <a  href=\"\"https://m.weibo.cn/search?container...  5991749877  \n",
       "1  我们黑牌今晚21：00定时抢现货～～<br />消费 top10 送 iPhone13！！<...  1549362863  \n",
       "2  这就是男同学的噩梦吗<span class=\"\"url-icon\"\"><img alt=[允...  1738932247  \n",
       "3  <a  href=\"\"https://m.weibo.cn/p/index?extparam...  5999050245  \n",
       "4  [赢牛奶]上海臻景医美专场&amp;双十一盛大活动就在眼前了！！！这次真的准备了百万级赠礼哦...  2845185161  "
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of weibo_df\n",
    "weibo_df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "微博id       0\n",
       "评论数        0\n",
       "发布时间       0\n",
       "来源         0\n",
       "转发数        0\n",
       "点赞数        0\n",
       "微博文章       0\n",
       "user_id    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-check every column for remaining missing values (count of nulls per column)\n",
    "weibo_df.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "import html  # standard-library module providing unescape()\n",
    "\n",
    "# The '微博文章' (post text) column holds raw HTML markup, so clean it in two steps:\n",
    "#   1. remove every HTML tag with a regular expression;\n",
    "#   2. decode HTML entities (e.g. '&amp;' -> '&') that tag removal leaves behind.\n",
    "# Tags are removed before decoding so that escaped text such as '&lt;b&gt;' is kept\n",
    "# as literal characters instead of being stripped as if it were markup.\n",
    "weibo_df['微博文章'] = (\n",
    "    weibo_df['微博文章']\n",
    "    .replace({r'<[^>]+>': ''}, regex=True)\n",
    "    .map(html.unescape, na_action='ignore')  # na_action='ignore' skips missing values\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>微博id</th>\n",
       "      <th>评论数</th>\n",
       "      <th>发布时间</th>\n",
       "      <th>来源</th>\n",
       "      <th>转发数</th>\n",
       "      <th>点赞数</th>\n",
       "      <th>微博文章</th>\n",
       "      <th>user_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4698762426975702</td>\n",
       "      <td>122</td>\n",
       "      <td>2021-11-01 16:35:24</td>\n",
       "      <td>见置顶微博Android</td>\n",
       "      <td>144</td>\n",
       "      <td>1236</td>\n",
       "      <td>#京东双十一组队#又是一年双十一，每年都在提醒自己做好#双十一攻略# ，只有这样，抄作业才更...</td>\n",
       "      <td>5991749877</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4699091725977118</td>\n",
       "      <td>561</td>\n",
       "      <td>2021-11-02 14:23:55</td>\n",
       "      <td>张大奕eve的小店</td>\n",
       "      <td>396</td>\n",
       "      <td>633</td>\n",
       "      <td>我们黑牌今晚21：00定时抢现货～～消费 top10 送 iPhone13！！转赞评里 我捞...</td>\n",
       "      <td>1549362863</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4699111129090066</td>\n",
       "      <td>1022</td>\n",
       "      <td>2021-11-02 15:41:01</td>\n",
       "      <td>微博视频号</td>\n",
       "      <td>1008</td>\n",
       "      <td>3055</td>\n",
       "      <td>这就是男同学的噩梦吗#双十一# 我的损友是个极品的微博视频</td>\n",
       "      <td>1738932247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4699120227059627</td>\n",
       "      <td>7</td>\n",
       "      <td>2021-11-02 16:17:10</td>\n",
       "      <td>剑网3超话</td>\n",
       "      <td>1</td>\n",
       "      <td>35</td>\n",
       "      <td>剑网3剑网3趣事投稿，关于我亲友评价双十一冲销挂件</td>\n",
       "      <td>5999050245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4699120453291260</td>\n",
       "      <td>71</td>\n",
       "      <td>2021-11-02 16:18:04</td>\n",
       "      <td>微博 weibo.com</td>\n",
       "      <td>26</td>\n",
       "      <td>60</td>\n",
       "      <td>[赢牛奶]上海臻景医美专场&amp;amp;双十一盛大活动就在眼前了！！！这次真的准备了百万级赠礼哦...</td>\n",
       "      <td>2845185161</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               微博id   评论数                 发布时间            来源   转发数   点赞数  \\\n",
       "0  4698762426975702   122  2021-11-01 16:35:24  见置顶微博Android   144  1236   \n",
       "1  4699091725977118   561  2021-11-02 14:23:55     张大奕eve的小店   396   633   \n",
       "2  4699111129090066  1022  2021-11-02 15:41:01         微博视频号  1008  3055   \n",
       "3  4699120227059627     7  2021-11-02 16:17:10         剑网3超话     1    35   \n",
       "4  4699120453291260    71  2021-11-02 16:18:04  微博 weibo.com    26    60   \n",
       "\n",
       "                                                微博文章     user_id  \n",
       "0  #京东双十一组队#又是一年双十一，每年都在提醒自己做好#双十一攻略# ，只有这样，抄作业才更...  5991749877  \n",
       "1  我们黑牌今晚21：00定时抢现货～～消费 top10 送 iPhone13！！转赞评里 我捞...  1549362863  \n",
       "2                     这就是男同学的噩梦吗#双十一# 我的损友是个极品的微博视频   1738932247  \n",
       "3                         剑网3剑网3趣事投稿，关于我亲友评价双十一冲销挂件   5999050245  \n",
       "4  [赢牛奶]上海臻景医美专场&amp;双十一盛大活动就在眼前了！！！这次真的准备了百万级赠礼哦...  2845185161  "
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the final result after the modifications\n",
    "weibo_df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the cleaned Weibo data to a CSV file.\n",
    "# index=False is the documented boolean form for leaving the row index out of the file\n",
    "# (the previous index=None only worked because None is falsy).\n",
    "weibo_df.to_csv(\"../data/weibo_clean.csv\", index=False, encoding='utf8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
